/*
  author Sylvain Bertrand <sylvain.bertrand@gmail.com>
  Licensed under the GNU GPLv2
  Copyright 2012-2014
*/
#include <linux/pci.h>
#include <asm/byteorder.h>
#include <linux/delay.h>
#include <linux/cdev.h>
#include <linux/vmalloc.h>

#include <alga/alga.h>
#include <alga/rng_mng.h>
#include <alga/timing.h>
#include <uapi/alga/pixel_fmts.h>
#include <uapi/alga/amd/dce6/dce6.h>

#include "mc.h"
#include "ih.h"
#include "rlc.h"
#include "fence.h"
#include "ring.h"
#include "dmas.h"
#include "ba.h"
#include "cps.h"
#include "gpu.h"
#include "drv.h"

#include "regs.h"

#include "hdp.h"

#include "ba_private.h"
#include "mapping_core_coherent.h"
#include "mapping_core_sg_kernel.h"
#include "mapping_sg_user.h"

/* Flush the hdp cache, then invalidate the gpu TLBs for all vm contexts. */
void tlb_flush(struct pci_dev *dev)
{
	hdp_cache_flush(dev);

	/* one request bit per vm context: bit n invalidates context n (0-15) */
	wr32(dev, 0x1, VM_INVALIDATE_REQ);
}

/*
 * Allocate the zeroed dummy page that backs every unmapped gpu pte.
 * The page's bus address must be GPU-page aligned, since it is written
 * into ptes as a page index. Returns 0 on success, -BA_ERR otherwise.
 */
static long dummy_page_alloc(struct pci_dev *dev)
{
	struct dev_drv_data *dd = pci_get_drvdata(dev);

	dd->ba.dummy_cpu_addr = dma_zalloc_coherent(&dev->dev, GPU_PAGE_SZ,
					&dd->ba.dummy_bus_addr, GFP_KERNEL);
	if (dd->ba.dummy_cpu_addr == NULL) {
		dev_err(&dev->dev, "ba:unable to allocate dummy page\n");
		return -BA_ERR;
	}

	if (!IS_GPU_PAGE_ALIGNED(dd->ba.dummy_bus_addr)) {
		dev_err(&dev->dev, "ba:dummy page bus addr not aligned on GPU page\n");
		dma_free_coherent(&dev->dev, GPU_PAGE_SZ, dd->ba.dummy_cpu_addr,
							dd->ba.dummy_bus_addr);
		dd->ba.dummy_cpu_addr = NULL;
		return -BA_ERR;
	}

	dev_info(&dev->dev, "ba:dummy page mapped cpu_addr=0x%p bus_addr=0x%016llx\n",
				dd->ba.dummy_cpu_addr, dd->ba.dummy_bus_addr);
	return 0;
}

/*
 * Program the gpu's VM hardware: L1 TLB control, L2 cache, then the 16 vm
 * contexts. Context 0 covers the bus aperture with the real page table;
 * contexts 1-15 are pointed at the same table as a placeholder. Register
 * write order follows the hardware init sequence — do not reorder.
 */
static void pt_init(struct pci_dev *dev)
{
	struct dev_drv_data *dd;
	u64 ba_end;
	u8 i;

	dd = pci_get_drvdata(dev);

	/* last byte covered by the bus aperture, inclusive */
	ba_end = dd->ba.mng.s + dd->ba.mng.sz - 1;

	/* setup tlb control, MX, X is generic for various client blocks */
	wr32(dev, (0xa << 7)/* ??? undocumented field, value taken as-is */
		| MVMLTC_ENA_L1_TLB
		| set(MVMLTC_SYS_ACCESS_MODE, MVMLTC_MAPPED_ACCESS_NOT_IN_SYS)
		| MVMLTC_ENA_ADVANCED_DRIVER_MODEL
		| set(MVMLTC_SYS_APER_UNMAPPED_ACCESS, MVMLTC_PASS_THRU),
							MC_VM_MX_L1_TLB_CTL);

	/* setup l2 cache */
	wr32(dev, VLC_ENA_L2_CACHE | VLC_ENA_L2_PTE_CACHE_LRU_UPDATE_BY_WR
					| VLC_ENA_L2_PDE0_CACHE_LRU_UPDATE_BY_WR
					| set(VLC_EFFECTIVE_L2_QUEUE_SZ, 7)
					| set(VLC_CTX1_IDENTITY_ACCESS_MODE, 1),
								 VM_L2_CTL_0);
	/* start from a clean slate: drop anything cached before this init */
	wr32(dev, VLC_INVALIDATE_ALL_L1_TLBS | VLC_INVALIDATE_L2_CACHE,
								VM_L2_CTL_1);
	wr32(dev, VLC_L2_CACHE_BIGK_ASSOCIATIVITY
			| set(VLC_L2_CACHE_BIGK_FRAGMENT_SZ, 0), VM_L2_CTL_2);

	/* ctx0: covers [mng.s, ba_end], ptes at pt_start, faults hit the
	 * dummy page */
	wr32(dev, GPU_PAGE_IDX(dd->ba.mng.s), VM_CTX_0_PT_START_ADDR);
	wr32(dev, GPU_PAGE_IDX(ba_end), VM_CTX_0_PT_END_ADDR);
	wr32(dev, GPU_PAGE_IDX(dd->ba.pt_start), VM_CTX_0_PT_BASE_ADDR);
	wr32(dev, GPU_PAGE_IDX(dd->ba.dummy_bus_addr),
					VM_CTX_0_PROTECTION_FAULT_DEFAULT_ADDR);
	wr32(dev, 0, VM_CTX_0_CTL_1);
	/* flat (depth 0) page table, faults redirected rather than raised */
	wr32(dev, VCC_ENA_CTX | set(VCC_PT_DEPTH, 0)
			| VCC_RNG_PROTECTION_FAULT_ENA_DEFAULT, VM_CTX_0_CTL_0);

	/* ??? undocumented registers, zeroed per the init sequence */
	wr32(dev, 0, 0x15d4);
	wr32(dev, 0, 0x15d8);
	wr32(dev, 0, 0x15dc);

	/* ctx 1 register is actually the settings for ctx 1-15 */
	wr32(dev, 0, VM_CTX_1_PT_START_ADDR);
	wr32(dev, 1 << 20, VM_CTX_1_PT_END_ADDR);
	for (i = 1; i < VM_CTXS_N; ++i)
		/* fake page table: reuse ctx0's table so the base is valid */
		wr32(dev, GPU_PAGE_IDX(dd->ba.pt_start),
						regs_vm_ctx_pt_base_addr[i]);
	wr32(dev, GPU_PAGE_IDX(dd->ba.dummy_bus_addr),
					VM_CTX_1_PROTECTION_FAULT_DEFAULT_ADDR);
	wr32(dev, 0, VM_CTX_1_CTL_1);
	wr32(dev, VCC_ENA_CTX | set(VCC_PT_DEPTH, 0)
			| VCC_RNG_PROTECTION_FAULT_ENA_DEFAULT, VM_CTX_1_CTL_0);

	/* make the new configuration live */
	tlb_flush(dev);
}

/*
 * One-time bus aperture setup, surviving suspend/resume cycles:
 * the maps semaphore and the shared dummy page.
 */
long ba_init_once(struct pci_dev *dev)
{
	struct dev_drv_data *dd = pci_get_drvdata(dev);

	init_rwsem(&dd->ba.maps_sem);
	return dummy_page_alloc(dev);
}

#define PT_ALIGN  (32 * 1024)
/*
 * Size the bus aperture, allocate the vm page table in vram and point every
 * pte at the dummy page, then program the VM hardware.
 * Returns 0 on success, -BA_ERR on failure.
 */
long ba_init(struct pci_dev *dev)
{
	struct dev_drv_data *dd;
	u64 i;
	u64 dummy_pte;
	long r;
	u64 pte_addr;
	u64 ba_start;
	u64 aligned_pt_sz;
	u64 ba_sz;

	dd = pci_get_drvdata(dev);

	/*
	 * The bus aperture is the size of vram plus wb page and rings. Upstream
	 * code (specs) tells us it must be a power of 2, maybe for old asics?
	 * Some boards have garbage in upper 16 bits of CFG_MEM_SZ.
	 */
	ba_sz = (rr32(dev, CFG_MEM_SZ) & 0xffff) * 1024 * 1024
			+ GPU_PAGE_SZ /* wb page */
			+ (1 << IH_RING_LOG2_DWS) * 4 /* IH ring */
			+ 3 * (1 << CP_RING_LOG2_QWS) * 8; /* 3 command rings */

	if (!IS_GPU_PAGE_ALIGNED(ba_sz)) {
		dev_err(&dev->dev, "ba:aperture size (vram size) is not aligned on gpu page size\n");
		return -BA_ERR;
	}

	dd->ba.ptes_n = GPU_PAGE_IDX(ba_sz);

	/* aperture after vram in gpu address space */
	ba_start = rng_align(dd->vram.mng.s + dd->vram.mng.sz, GPU_PAGE_SZ);

	/* the pt must be 32kB aligned, we do align the booking sz too */
	aligned_pt_sz = dd->ba.ptes_n * PTE_SZ;
	aligned_pt_sz = ((aligned_pt_sz / PT_ALIGN) + 1) * PT_ALIGN;
	r = rng_alloc_align(&dd->ba.pt_start, &dd->vram.mng, aligned_pt_sz,
								PT_ALIGN);
	if (r == -ALGA_ERR) {
		dev_err(&dev->dev, "ba:unable to allocate vram for page table entries\n");
		return -BA_ERR;
	}

	rng_mng_init(&dd->ba.mng, ba_start, ba_sz);
	INIT_LIST_HEAD(&dd->ba.maps);

	dummy_pte = dd->ba.dummy_bus_addr | PTE_VALID | PTE_SYSTEM
				| PTE_SNOOPED | PTE_READABLE | PTE_WRITEABLE;

	/* initially, all aperture pages fall through to the dummy page */
	for (i = 0; i < dd->ba.ptes_n; ++i) {
		pte_addr = dd->ba.pt_start + i * PTE_SZ;
		/* 64 bits little endian */
		vram_w32(dev, lower_32_bits(dummy_pte), pte_addr);
		vram_w32(dev, upper_32_bits(dummy_pte), pte_addr
								+ sizeof(u32));
	}

	/* make sure the gpu pte updates were sent over the bus */
	wmb();

	tlb_flush(dev);

	pt_init(dev);
	dev_info(&dev->dev, "ba:page table at 0x%016llx aperture size=0x%016llx (ptes_n=0x%016llx)\n",
					dd->ba.pt_start, ba_sz, dd->ba.ptes_n);
	return 0;
}
#undef PT_ALIGN

/*
 * Undo pt_init: disable all vm contexts and return the L1 TLB / L2 cache
 * to pass-through mode. Register write order mirrors the init sequence —
 * do not reorder.
 */
static void pt_shutdown(struct pci_dev *dev)
{
	/* disable all tables */
	wr32(dev, 0, VM_CTX_0_CTL_0);
	wr32(dev, 0, VM_CTX_1_CTL_0);

	/* setup tlb control: note MVMLTC_ENA_L1_TLB is left cleared */
	wr32(dev, set(MVMLTC_SYS_ACCESS_MODE, MVMLTC_MAPPED_ACCESS_NOT_IN_SYS)
		| set(MVMLTC_SYS_APER_UNMAPPED_ACCESS, MVMLTC_PASS_THRU),
							MC_VM_MX_L1_TLB_CTL);

	/* setup l2 cache: note VLC_ENA_L2_CACHE is left cleared */
	wr32(dev, VLC_ENA_L2_PTE_CACHE_LRU_UPDATE_BY_WR
					| VLC_ENA_L2_PDE0_CACHE_LRU_UPDATE_BY_WR
					| set(VLC_EFFECTIVE_L2_QUEUE_SZ, 7)
					| set(VLC_CTX1_IDENTITY_ACCESS_MODE, 1),
								 VM_L2_CTL_0);
	wr32(dev, 0, VM_L2_CTL_1);
	wr32(dev, VLC_L2_CACHE_BIGK_ASSOCIATIVITY
			| set(VLC_L2_CACHE_BIGK_FRAGMENT_SZ, 0), VM_L2_CTL_2);

	/* make the disabled configuration live */
	tlb_flush(dev);
}

/*
 * Tear down the bus aperture: disable the VM hardware, then release the
 * aperture range manager and the vram backing the page table.
 */
void ba_shutdown(struct pci_dev *dev)
{
	struct dev_drv_data *dd = pci_get_drvdata(dev);

	pt_shutdown(dev);
	rng_mng_destroy(&dd->ba.mng);
	rng_free(&dd->vram.mng, dd->ba.pt_start);
}


/*
 * Write one pte at pte_addr (vram offset) pointing at bus_addr, flagged
 * valid/system/snooped/read-write.
 */
void pte_mmio_regs_install(struct pci_dev *dev, u64 pte_addr, u64 bus_addr)
{
	u64 pte;

	pte = bus_addr | PTE_VALID | PTE_SYSTEM | PTE_SNOOPED | PTE_READABLE
							| PTE_WRITEABLE;

	/* ptes are 64 bits little endian: low dword first */
	vram_w32(dev, lower_32_bits(pte), pte_addr);
	vram_w32(dev, upper_32_bits(pte), pte_addr + sizeof(u32));
}

/* Dispatch map teardown to the handler matching the map's type. */
static void cleanup(struct pci_dev *dev, struct ba_map *m, u8 flgs)
{
	switch (m->type) {
	case BA_MAP_COHERENT_CONTIG:
		core_coherent_cleanup(dev, m, flgs);
		break;
	case BA_MAP_KERNEL_SG:
		core_sg_kernel_cleanup(dev, m, flgs);
		break;
	case BA_MAP_USER_SG:
		sg_user_cleanup(dev, m, flgs);
		break;
	default:
		/* unknown map type: nothing we can release */
		break;
	}
}

/*
 * Remove m from the maps list (if present) and release its resources, then
 * — unless BA_NO_PT_UPDATE — flush the tlb so the restored dummy ptes go
 * live.
 */
static void unmap(struct pci_dev *dev, struct ba_map *m, u8 flgs)
{
	struct ba_map *cur;
	struct ba_map *nxt;
	struct dev_drv_data *dd = pci_get_drvdata(dev);

	list_for_each_entry_safe(cur, nxt, &dd->ba.maps, n) {
		if (cur != m)
			continue;
		list_del(&cur->n);
		cleanup(dev, cur, flgs);
		break;
	}

	if (flgs & BA_NO_PT_UPDATE)
		return;

	/* make sure the gpu pte updates were sent over the bus */
	wmb();

	/* flush tlb to make live the restored dummy pages */
	tlb_flush(dev);
}


/*
 * Map the write back page into the bus aperture.
 * Returns core_coherent_map's result (0 or -BA_ERR). The return type is
 * long, matching the sibling *_map helpers and core_coherent_map itself
 * (it was int, which could truncate the long result).
 */
static long wb_map(struct pci_dev *dev)
{
	struct dev_drv_data *dd;

	dd = pci_get_drvdata(dev);

	dev_info(&dev->dev, "ba:mapping write back page bus aperture...\n");
	return core_coherent_map(dev, GPU_PAGE_SZ, &dd->ba.wb_map);
}

/* Map the interrupt handler ring into the bus aperture. */
static long ih_ring_map(struct pci_dev *dev)
{
	struct dev_drv_data *dd = pci_get_drvdata(dev);
	u64 ring_bytes;

	/* the ih ring holds 2^IH_RING_LOG2_DWS dwords */
	ring_bytes = (1 << IH_RING_LOG2_DWS) * 4;

	dev_info(&dev->dev, "ba:mapping ih ring bus aperture...\n");
	return core_coherent_map(dev, ring_bytes, &dd->ba.ih_ring_map);
}

/*
 * Map command processor ring i into the bus aperture, storing the map in *m.
 * (Removed an unused pci_get_drvdata local — the caller supplies the map
 * slot directly.)
 */
static long cp_ring_map(struct pci_dev *dev, u8 i, struct ba_map **m)
{
	u64 cp_ring_sz;

	/* the cp ring holds 2^CP_RING_LOG2_QWS qwords */
	cp_ring_sz = (1 << CP_RING_LOG2_QWS) * 8;

	dev_info(&dev->dev, "ba:mapping cp%u ring bus aperture...\n", i);
	return core_coherent_map(dev, cp_ring_sz, m);
}

/* Map dma engine ring i into the bus aperture. */
static long dma_ring_map(struct pci_dev *dev, u8 i)
{
	struct dev_drv_data *dd = pci_get_drvdata(dev);
	u64 ring_bytes;

	/* the dma ring holds 2^DMA_RING_LOG2_DWS dwords */
	ring_bytes = (1 << DMA_RING_LOG2_DWS) * 4;

	dev_info(&dev->dev, "ba:mapping dma%u ring bus aperture...\n", i);
	return core_coherent_map(dev, ring_bytes, &dd->ba.dma_rings_maps[i]);
}

/* Map the vm context 0 page table into the kernel's address space. */
static long pt_map(struct pci_dev *dev)
{
	struct dev_drv_data *dd = pci_get_drvdata(dev);
	u64 pt_bytes;
	u64 map_sz;

	/*
	 * round the page table size up to a multiple of GPU_PAGE_SZ so the
	 * mapping embeds the whole pt
	 */
	pt_bytes = dd->ba.ptes_n * PTE_SZ;
	map_sz = GPU_PAGE_MASK(pt_bytes);
	if (!IS_GPU_PAGE_ALIGNED(pt_bytes))
		map_sz += GPU_PAGE_SZ;

	dev_info(&dev->dev, "ba:mapping vm0 page table aperture...\n");
	return core_sg_kernel_map(dev, map_sz, &dd->ba.pt_map);
}

/*
 * Map all core driver objects into the bus aperture: write back page, ih
 * ring, the 3 cp rings, the 2 dma rings and the vm0 page table. On any
 * failure the already-established maps are unwound in reverse order via the
 * goto chain below — keep the labels in sync with the acquisition order.
 * Returns 0 on success, -BA_ERR on failure.
 */
long ba_core_map(struct pci_dev *dev)
{
	struct dev_drv_data *dd;
	long r;

	dd = pci_get_drvdata(dev);

	r = wb_map(dev);
	if (r == -BA_ERR)
		goto err;

	r = ih_ring_map(dev);
	if (r == -BA_ERR)
		goto err_unmap_wb;

	r = cp_ring_map(dev, 0, &dd->ba.gpu_3d_ring_map);
	if (r == -BA_ERR)
		goto err_unmap_ih_ring;

	r = cp_ring_map(dev, 1, &dd->ba.gpu_c0_ring_map);
	if (r == -BA_ERR)
		goto err_unmap_gpu_3d_ring;

	r = cp_ring_map(dev, 2, &dd->ba.gpu_c1_ring_map);
	if (r == -BA_ERR)
		goto err_unmap_gpu_c0_ring;

	r = dma_ring_map(dev, 0);
	if (r == -BA_ERR)
		goto err_unmap_gpu_c1_ring;

	r = dma_ring_map(dev, 1);
	if (r == -BA_ERR)
		goto err_unmap_dma0_ring;

	r = pt_map(dev);
	if (r == -BA_ERR)
		goto err_unmap_dma1_ring;
	return 0;

	/* unwind: each label undoes the map acquired just before the jump */
err_unmap_dma1_ring:
	unmap(dev, dd->ba.dma_rings_maps[1], 0);

err_unmap_dma0_ring:
	unmap(dev, dd->ba.dma_rings_maps[0], 0);

err_unmap_gpu_c1_ring:
	unmap(dev, dd->ba.gpu_c1_ring_map, 0);

err_unmap_gpu_c0_ring:
	unmap(dev, dd->ba.gpu_c0_ring_map, 0);

err_unmap_gpu_3d_ring:
	unmap(dev, dd->ba.gpu_3d_ring_map, 0);

err_unmap_ih_ring:
	unmap(dev, dd->ba.ih_ring_map, 0);

err_unmap_wb:
	unmap(dev, dd->ba.wb_map, 0);
err:
	return -BA_ERR;
}

/*
 * Install a userspace scatter/gather mapping in the bus aperture.
 * cpu_addr has only one purpose: to link the map with its vm_area_struct —
 * callers put vm_start in there.
 */
long ba_map(struct pci_dev *dev, void __iomem *cpu_addr,
					struct sg_table *sg_tbl, int nents)
{
	return sg_user_map(dev, cpu_addr, sg_tbl, nents);
}

/*
 * Remove the userspace map linked to cpu_addr (vm_start). If no map is
 * found, just ignore: that helps for suspend/resume when all gpu mappings
 * are gone. Takes maps_sem for writing.
 */
void ba_unmap(struct pci_dev *dev, void __iomem *cpu_addr)
{
	struct ba_map *cur;
	struct ba_map *nxt;
	struct dev_drv_data *dd = pci_get_drvdata(dev);

	down_write(&dd->ba.maps_sem);
	list_for_each_entry_safe(cur, nxt, &dd->ba.maps, n) {
		if (cur->cpu_addr != cpu_addr)
			continue;
		list_del(&cur->n);
		sg_user_cleanup(dev, cur, 0);
		break;
	}
	up_write(&dd->ba.maps_sem);
}

/*
 * Unmap every map on the maps list, honoring flgs (e.g. BA_NO_PT_UPDATE).
 * NOTE(review): unlike ba_unmap, maps_sem is NOT taken here — presumably
 * callers run with no concurrent mappers (e.g. suspend/teardown); confirm
 * against the call sites.
 */
void ba_all_unmap(struct pci_dev *dev, u8 flgs)
{
	struct dev_drv_data *dd;
	struct ba_map *pos;
	struct ba_map *tmp;

	dd = pci_get_drvdata(dev);

	/* _safe variant: unmap() deletes pos from the list as we iterate */
	list_for_each_entry_safe(pos, tmp, &dd->ba.maps, n) {
		unmap(dev, pos, flgs);
	}
}
